#########################################################################   
#               INFO                                                    #   
#########################################################################  

# PROJECT: Gender Quotas + Party strategy
# PURPOSE: Analysis of candidate quality and individual-level regressions
# CREATED: August 2020 by Alexandra Blackman
# INPUTS:  quotas_list_clean.RData, quotas_lecs_clean.RData, quotas_lecs_top2_party_clean.RData

#########################################################################   
#               SETUP                                                   #   
######################################################################### 

rm(list = ls()) 
setwd("~/Desktop/replication_what_men_want/")

######## PACKAGES

need <- c("plyr", "tidyr", "dplyr", "ggplot2", "openxlsx", "stargazer", "sjPlot","tibble",
          "lmtest", "multiwayvcov", "viridis", "broom","xtable", "ggpubr") # packages needed
have <- need %in% rownames(installed.packages()) # installed packages
if(any(!have)) install.packages(need[!have]) # install missing packages
invisible(lapply(need, library, character.only=T)) # load needed packages

# options(scipen=999)

#### CLUSTERING FUNCTION

cluster.se <- function(model, cluster){
  coeftest(model, vcov=function(x) cluster.vcov(x, cluster))[,2]
}

#########################################################################   
#               LOAD SUMMARIZING FUNCTIONS                              #   
#########################################################################

## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
##   data: a data frame.
##   measurevar: the name of a column that contains the variable to be summarized
##   groupvars: a vector containing names of columns that contain grouping variables
##   na.rm: a boolean that indicates whether to ignore NA's
##   conf.interval: the percent range of the confidence interval (default is 95%)
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=T,
                      conf.interval=.95, .drop=TRUE) {
  library(plyr)
  
  # New version of length which can handle NA's: if na.rm==T, don't count them
  length2 <- function (x, na.rm=T) {
    if (na.rm) sum(!is.na(x))
    else       length(x)
  }
  
  # This does the summary. For each group's data frame, return a vector with
  # N, mean, and sd
  datac <- ddply(data, groupvars, .drop=.drop,
                 .fun = function(xx, col) {
                   c(N    = length2(xx[[col]], na.rm=na.rm),
                     mean = mean   (xx[[col]], na.rm=na.rm),
                     sd   = sd     (xx[[col]], na.rm=na.rm)
                   )
                 },
                 measurevar
  )
  
  # Rename the "mean" column    
  datac <- plyr::rename(datac, c("mean" = measurevar))
  
  datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean
  
  # Confidence interval multiplier for standard error
  # Calculate t-statistic for confidence interval: 
  # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
  ciMult <- qt(conf.interval/2 + .5, datac$N-1)
  datac$ci <- datac$se * ciMult
  
  return(datac)
}


#########################################################################   
#               GGPLOT PARAMETERS                                       #   
######################################################################### 

# Colors
vid_low <- "#481567FF" #MHL
vid_med <- "#29AF7FFF" #FHL

pd <- position_dodge(0.1) # move them .05 to the left and right  


# Theme with lines for coeff plots
lecs_theme_lines <- theme_bw() + 
  theme(text = element_text(size = 10),
        title = element_text(face = "bold", color = "black", size = 14), 
        plot.subtitle = element_text(face = "italic", color = "grey40", size = 14),
        axis.title = element_text(face = "bold", color = "black", size = 12), 
        legend.title = element_text(face = "bold", color = "black", size = 12),
        legend.text = element_text(size = 10),
        axis.text.x = element_text(size = 10),
        plot.caption = element_text(face = "plain", color = "grey40", size = 10,
                                    hjust = 0, margin = margin(t = 15, r = 0, b = 0,
                                                               l = 0)),
        axis.title.x = element_text(margin = margin(t = 20, r = 0, b = 0, l = 0)),
        axis.title.y = element_text(margin = margin(t = 0, r = 15, b = 0, l = 0),
                                    angle = 0, vjust = .5),
        panel.background = element_blank()
  )

#########################################################################   
#               LOAD AND CLEAN DATA                                     #   
#########################################################################

##### LOAD DATA

# List-level election data ["list"]
load("data/quotas_list_clean.RData")

# LECS respondent-level data ["lecs"]
load("data/quotas_lecs_clean.RData")

# LECS top 2 candidates respondent-level data ["lecs_top2_party"]
load("data/quotas_lecs_top2_party_clean.RData")



#########################################################################   
#               SUBSET DATA                                             #   
#########################################################################

# List ran in 2014
lecs_ran2014 <- lecs %>% filter(!is.na(above_mean_2014_parl))
nrow(lecs_ran2014) # 1173

# List ran in 2014 top-2 candidates
lecs_ran2014_top2 <- lecs_top2_party %>% filter(!is.na(above_mean_2014_parl))
nrow(lecs_ran2014_top2) # 301

# Top-2 female candidates in parties
lecs_top_women <- lecs_top2_party[lecs_top2_party$cand_fem==1,]
nrow(lecs_top_women) # 165

# Top-2 male candidates in parties
lecs_top_men <- lecs_top2_party[lecs_top2_party$cand_fem==0,]
nrow(lecs_top_men) # 181


#########################################################################   
#                 FIGURES/TABLES FOR THE PAPER                          #   
######################################################################### 

###################################################################################   
#       CONTRAST QUALITY OF TOP M/F CANDIDATES WITHIN PARTY LISTS   (FIGURE 5)    #   
###################################################################################

##### TOP MEN VERSUS TOP FEMALE (QUALITY MEASURES)
nrow(lecs_top2_party) #346

# Transform to long
lecs_top2_long <- gather(lecs_top2_party, measure, value, c(edu_hig:rcd), factor_key=TRUE) %>% 
  filter(measure !=  "mun_coun") %>% filter(measure !=  "ran_parl") %>% filter(measure !=  "prev_admin") %>%
  filter(measure !=  "party_years") %>% filter(measure !=  "party_member") %>% 
  filter(measure !=  "rcd") %>% filter(measure !=  "act_index") %>% filter(measure !=  "mem_index")
lecs_top2_long <- mutate(lecs_top2_long, gender = ifelse(cand_fem == 1, "Female", "Male"))

# Recode variables
lecs_top2_long$gender <- factor(lecs_top2_long$gender)
lecs_top2_long$measure <- factor(lecs_top2_long$measure, levels= c("edu_hig","ambition_yes",
                                                                   "know_seats_correct",
                                                                   "skill_index","exp_pol"))

top2_summ2 <- summarySE(lecs_top2_long, measurevar="value", groupvars=c("measure","gender"), na.rm= T)
top2_summ2

##### CREATE GRAPH

ggplot(top2_summ2, aes(x=measure, y=value, fill=gender)) + 
  geom_bar(position=position_dodge(), stat="identity") +
  geom_errorbar(aes(ymin=value-ci, ymax=value+ci),
                width=.2,                    # Width of the error bars
                position=position_dodge(.9),  col = "grey70")+
  ylab("Outcome\nmean") + 
  scale_fill_manual(values = c(vid_low, vid_med)) + 
  scale_y_continuous(limits = c(0,2), expand = c(0,0) ) +
  scale_x_discrete(labels=c("Education","Ambition","Knowledge",
                            "Skills","Experience")) +
  # ggtitle(title = "Contrasting top male and top female within party lists")
  labs(x = "Candidate quality measure", fill = "Gender", 
       caption = "Graph shows averages of candidate quality, comparing the first- and second-ranked candidates on interviewed party lists (n = 346).\nData comes from the Authors' Local Election Candidate Survey (LECS) of candidates for the 2018 municipal elections.") +
  lecs_theme_lines
ggsave("plot_top2_quality_gender.pdf", plot = last_plot(),
       path = "figures", limitsize = F,
       width = 10, height = 5.5)


##################################################################################   
#   CONTRAST EMBEDDEDNESS OF TOP M & F CANDIDATES WITHIN PARTY LISTS  (TABLE 3)  #   
################################################################################## 

table(lecs_top2_party$cand_fem)
lecs_top2_party <- mutate(lecs_top2_party, gender = ifelse(cand_fem == 1, "Female", "Male"))


###### TOP MEN VERSUS TOP FEMALE (MALE DOMINANCE MEASURES) -- LATEX TABLE 

# create tidy table with t-tests for variables of interest
top2_party_t_tests <- bind_rows(tidy(t.test(exp_pol ~ gender, lecs_top2_party)),
                                tidy(t.test(act_index ~ gender, lecs_top2_party)),
                                tidy(t.test(mem_index ~ gender, lecs_top2_party)),
                                tidy(t.test(rcd ~ gender, lecs_top2_party)),
                                tidy(t.test(party_member ~ gender, lecs_top2_party)),
                                tidy(t.test(party_years ~ gender, lecs_top2_party))
)

# prepare table for latex estimate1 = women & estimate2 = men
top2_party_t_tests <- top2_party_t_tests %>%
  add_column(variable = c("Political Experience",
                          "Activities Index",
                          "Membership Index",
                          "RCD Member",
                          "Party Member",
                          "Years in Party")) %>%
  dplyr::select(variable,estimate1,estimate2,estimate,p.value) %>%
  add_row(variable = "Observations",
          estimate1 = 165,
          estimate2 = 181) %>%
  mutate_at(vars(estimate1, estimate2, estimate), funs(round(., 3))) %>%
  mutate_at(vars(p.value), funs(round(., 3))) %>%
  mutate(estimate = -estimate) %>%
  dplyr::rename("Variable" = variable,
                "Mean (Women)" = estimate1,
                "Mean (Men)" = estimate2,
                "Difference" = estimate,
                "p-value" = p.value)

stargazer(top2_party_t_tests,
          type="latex",
          summary = F,
          rownames = F,
          single.row = T,
          out = 'tables/top2_party_t_tests.tex')


#########################################################################   
#       CONTRAST QUALITY OF TOP F CANDIDATES WITHIN PARTY LISTS         #   
######################################################################### 
###### TOP F on FHL VERSUS MHL (QUALITY MEASURES) -- TABLE A.4 (APPENDIX)

nrow(lecs_top_women) #165
lecs_top_women$fhl2 <- factor(ifelse(lecs_top_women$FHL == 1, "FHL", "MHL"))
table(lecs_top_women$fhl2)

###### TOP F on FHL VERSUS MHL (QUALITY MEASURES) -- LATEX TABLE 

# create tidy table with t-tests for variables of interest
top_women_t_tests <- bind_rows(tidy(t.test(edu_hig ~ fhl2, lecs_top_women)),
                               tidy(t.test(ambition_yes ~ fhl2, lecs_top_women)),
                               tidy(t.test(know_seats_correct ~ fhl2, lecs_top_women)),
                               tidy(t.test(skill_index ~ fhl2, lecs_top_women)),
                               tidy(t.test(exp_pol ~ fhl2, lecs_top_women)),
                               tidy(t.test(act_index ~ fhl2, lecs_top_women)),
                               tidy(t.test(mem_index ~ fhl2, lecs_top_women)),
                               tidy(t.test(rcd ~ fhl2, lecs_top_women)),
                               tidy(t.test(party_member ~ fhl2, lecs_top_women)),
                               tidy(t.test(party_years ~ fhl2, lecs_top_women))
)

# prepare table for latex estimate1 = FHL & estimate2 = MHL
top_women_t_tests <- top_women_t_tests %>%
  add_column(variable = c("University Education",
                          "Political Ambition",
                          "Political Knowledge",
                          "Skill Index",
                          "Political Experience",
                          "Activities Index",
                          "Membership Index",
                          "RCD Member",
                          "Party Member",
                          "Years in Party")) %>%
  dplyr::select(variable,estimate1,estimate2,estimate,p.value) %>%
  add_row(variable = "Observations",
          estimate1 = 79,
          estimate2 = 86) %>%
  mutate_at(vars(estimate1, estimate2, estimate), funs(round(., 3))) %>%
  mutate_at(vars(p.value), funs(round(., 3))) %>%
  mutate(estimate = -estimate) %>%
  dplyr::rename("Variable" = variable,
                "Mean (FHL)" = estimate1,
                "Mean (MHL)" = estimate2,
                "Difference" = estimate,
                "p-value" = p.value)

stargazer(top_women_t_tests,
          type="latex",
          summary = F,
          rownames = F,
          single.row = T,
          out = 'tables/top_women_t_tests.tex')




#########################################################################   
#               PREDICT PROBABILITY OF RANK (FIGURE 4)                  #   
######################################################################### 

#### OLS MODELS

ols <- list()

ols[[1]] <- lm(list_head ~ cand_fem*above_mean_2014_parl + cand_fem*sd_log_pop + 
                 sd_pop_com_per + sd_emp_unemploy_rate + sd_turnout_per_pop_f_2014_parl +
                 sd_age + edu_hig + ambition_yes + skill_index  + know_seats_correct + exp_pol + list_class +  gov_en, 
               data = lecs_ran2014_top2)

ols[[2]] <- lm(list_head ~ above_mean_2014_parl + sd_log_pop + 
                 sd_pop_com_per + sd_emp_unemploy_rate + sd_turnout_per_pop_f_2014_parl +
                 sd_age + edu_hig + ambition_yes + skill_index  + know_seats_correct + exp_pol + list_class +  gov_en, 
               data = lecs_top_men)

ols[[3]] <- lm(list_head ~ sd_log_pop + above_mean_2014_parl + 
                 sd_pop_com_per + sd_emp_unemploy_rate + sd_turnout_per_pop_f_2014_parl +
                 sd_age + edu_hig + ambition_yes + skill_index  + know_seats_correct + exp_pol + list_class +  gov_en, 
               data = lecs_top_women)

names(ols) <- c("All", "Men", "Women")


ols.cl <- list()
ols.cl[[1]] <- cluster.se(ols[[1]], lecs_ran2014_top2$rand_id)
ols.cl[[2]] <- cluster.se(ols[[2]], lecs_top_men$rand_id)
ols.cl[[3]] <- cluster.se(ols[[3]], lecs_top_women$rand_id)

stargazer(ols,
          out = "tables/fhl_individual_ols.tex",
          title = "Individual-level predictors of being chosen as the list head",
          label = "fhl.ols.candidates",
          table.placement = "H",
          type = "text", 
          omit = c("list_class", "gov_en", "mun_en", "list_number"), 
          column.labels = names(ols), 
          se = ols.cl,
          omit.stat = "f", 
          font.size = "scriptsize",
          add.lines = list(c("Governorate FE", rep("Y", length(ols)))),
          notes.align = "l",
          dep.var.labels = "Probability of Heading the List",
          covariate.labels = c("Candidate is female",
                               "Above mean relative strength in 2014",
                               "Log population",
                               "Urbanization rate",
                               "Unemployment rate",
                               "Female turnout in 2014",
                               "Age",
                               "University education",
                               "Ambition to run for Parl.",
                               "Skill index",
                               "Knowledge test correct",
                               "Political experience",
                               "Female x Strength",
                               "Female x Log population",
                               "Constant"),
          notes = c("OLS models with standard errors clustered by municipality and standardized", 
                    "continuous variables (log population, urbanization, unemployment, female turnout,",
                    "and age). Individual-level measures come from the Local Election Candidate (LECS)", 
                    "survey, and analysis includes only first- and second-ranked respondents in our sample."))

#### LOGIT MODELS

log <- list()

log[[1]] <- glm(list_head ~ cand_fem*above_mean_2014_parl + cand_fem*sd_log_pop + 
                  sd_pop_com_per + sd_emp_unemploy_rate + sd_turnout_per_pop_f_2014_parl +
                  sd_age + edu_hig + ambition_yes + skill_index  + know_seats_correct + exp_pol + list_class +  gov_en, 
                data = lecs_ran2014_top2, family = "binomial")

log[[2]] <- glm(list_head ~ above_mean_2014_parl + sd_log_pop + 
                  sd_pop_com_per + sd_emp_unemploy_rate + sd_turnout_per_pop_f_2014_parl +
                  sd_age + edu_hig + ambition_yes + skill_index  + know_seats_correct + exp_pol + list_class +  gov_en, 
                data = lecs_top_men, family = "binomial")

log[[3]] <- glm(list_head ~ sd_log_pop + above_mean_2014_parl + 
                  sd_pop_com_per + sd_emp_unemploy_rate + sd_turnout_per_pop_f_2014_parl +
                  sd_age + edu_hig + ambition_yes + skill_index  + know_seats_correct + exp_pol + list_class +  gov_en, 
                data = lecs_top_women, family = "binomial")

names(log) <- c("All", "Men", "Women")


log.cl <- list()
log.cl[[1]] <- cluster.se(log[[1]], lecs_ran2014_top2$rand_id)
log.cl[[2]] <- cluster.se(log[[2]], lecs_top_men$rand_id)
log.cl[[3]] <- cluster.se(log[[3]], lecs_top_women$rand_id)

stargazer(log,
          out = "tables/fhl_individual_log.tex",
          title = "Individual-level predictors of being chosen as the list head",
          label = "fhl.log.candidates",
          table.placement = "H",
          type = "text", 
          omit = c("list_class", "gov_en", "mun_en", "list_number"), 
          column.labels = names(log), 
          se = log.cl,
          omit.stat = "f", 
          font.size = "scriptsize",
          add.lines = list(c("Governorate FE", rep("Y", length(log)))),
          notes.align = "l",
          dep.var.labels = "Probability of Heading the List",
          covariate.labels = c("Candidate is female",
                               "Above mean relative strength in 2014",
                               "Log population",
                               "Urbanization rate",
                               "Unemployment rate",
                               "Female turnout in 2014",
                               "Age",
                               "University education",
                               "Ambition to run for Parl.",
                               "Skill index",
                               "Knowledge test correct",
                               "Political experience",
                               "Female x Strength",
                               "Female x Log population",
                               "Constant"),
          notes = c("Logit models with standard errors clustered by municipality and standardized", 
                    "continuous variables (log population, urbanization, unemployment, female turnout,",
                    "and age). Individual-level measures come from the Local Election Candidate (LECS)", 
                    "survey, and analysis includes only first- and second-ranked respondents in our sample."))


##### PLOTS

# Extract data for interaction plot
s <- plot_model(log[[1]], type = "pred", terms = c("above_mean_2014_parl", "cand_fem")) 
s
sdat <- data.frame(x = s$data$x, pred = s$data$predicted, 
                   ci_low = s$data$conf.low, ci_high = s$data$conf.high) %>% 
  dplyr::mutate(Candidate = rep(c("Male", "Female"),2),
                x = ifelse(x == 1, "Above mean", "Below mean")) %>%
  dplyr::mutate(x = factor(x, levels = c("Below mean", "Above mean")))

p <- plot_model(log[[1]], type = "pred", terms = c("sd_log_pop [all]", "cand_fem")) 
p
pdat <- data.frame(x = p$data$x, pred = p$data$predicted, 
                   ci_low = p$data$conf.low, ci_high = p$data$conf.high)

pdat <- data.frame(x = p$data$x, pred = p$data$predicted, 
                   ci_low = p$data$conf.low, ci_high = p$data$conf.high) %>% 
  dplyr::mutate(Candidate = rep(c("Male", "Female"),nrow(pdat)/2))

# Strength plot
splot <- ggplot(sdat, aes(x = x, y = pred, ymin = ci_low, ymax = ci_high, 
                          color = Candidate, fill = Candidate)) +
  geom_point(position = position_dodge(width = .1), size = 3) + 
  geom_errorbar(position = position_dodge(width = .1), width = .1) +
  scale_color_manual(values = c(vid_low, vid_med)) +
  scale_fill_manual(values = c(vid_low, vid_med)) +
  scale_y_continuous(limits = c(0, 1), expand = c(0,0),  labels = scales::percent_format(accuracy = 1)) +
  labs(y = "Probability\nof being\nlist head", x = "Party's relative strength") + 
  lecs_theme_lines

# Population plot
pplot <- ggplot(pdat, aes(x = x, y = pred, ymin = ci_low, ymax = ci_high, 
                          color = Candidate, fill = Candidate)) +
  geom_ribbon(alpha = .1, color = NA) +
  geom_line(size = 1) +
  scale_color_manual(values = c(vid_low, vid_med)) +
  scale_fill_manual(values = c(vid_low, vid_med)) +
  scale_y_continuous(limits = c(0, 1), expand = c(0,0),  labels = scales::percent_format(accuracy = 1)) +
  labs(y = "    ", x = "Log population (standard deviations)") + 
  lecs_theme_lines

# Combine plots
combo <- ggarrange(splot, pplot, 
                   ncol = 2, nrow = 1)

annotate_figure(combo,
                bottom = text_grob("Predicted values based on a logit model controlling for candidate quality measures, including age, education, political experience, skills,\nknowledge, and ambition, as well as party affiliation, unemployment rate, urbanization rate, female turnout in 2014, and governorate\nfixed effects. Sample is restricted to the top two LECS respondents who ran with a party that competed in 2014. Graph\nshows 95 percent confidence intervals.", size = 11) )

ggsave("fhl_individual.pdf", plot = last_plot(),
       path = "figures/", limitsize = F,
       width = 11, height = 4)


